#==============================================================================#
# 1-extract-profiles-bd.R
# Matt Curtis 27/06/22
# mjdcurtis@gmail.com
#------------------------------------------------------------------------------#
# Converts the Geni union-details CSV into a chunked Parquet dataset.
#==============================================================================#
library(arrow)
library(dplyr)

#------------------------------------------------------------------------------#
# Start from an empty workspace: remove every non-hidden object from the
# current environment, then run the garbage collector.
# NOTE(review): this does not detach packages or reset options, and it wipes
# the caller's variables if this script is source()d from another script.
rm(list = ls())
gc()

# Set the working directory once; every path below is relative to it.
# NOTE(review): the root is hard-coded to a folder in the user's home
# directory -- adjust it if the replication package lives elsewhere.
setwd("~/Replication package/")

# Input CSV and the directory that receives the Parquet chunks.
csv_file <- "./1_raw_data/1_2_geni/geni_union_details_patched.csv"
dest <- "./1_raw_data/1_2_geni/1_2_geni_chunked/geni_union_details"

# Fail early, with the resolved location in the message, if the input is
# missing, instead of handing a bad path to Arrow.
if (!file.exists(csv_file)) {
  stop(
    "Input CSV not found: ", file.path(getwd(), csv_file),
    call. = FALSE
  )
}

# Explicit Arrow schema for the union-details CSV. Text columns are declared
# as string(); every *_day / *_month / *_year column is declared as double().
# NOTE(review): the reader takes column names from this schema, so the order
# here presumably has to match the column order in the file -- confirm.
sch <- schema(
  union_id             = string(),

  # marriage_* columns
  marriage_range       = string(),
  marriage_start_day   = double(),
  marriage_start_month = double(),
  marriage_start_year  = double(),
  marriage_start_circa = string(),
  marriage_end_day     = double(),
  marriage_end_month   = double(),
  marriage_end_year    = double(),
  marriage_end_circa   = string(),
  marriage_city        = string(),
  marriage_state       = string(),
  marriage_county      = string(),
  marriage_country     = string(),
  marriage_place_name  = string(),

  # divorce_* columns
  divorce_range        = string(),
  divorce_start_day    = double(),
  divorce_start_month  = double(),
  divorce_start_year   = double(),
  divorce_start_circa  = string(),
  divorce_end_day      = double(),
  divorce_end_month    = double(),
  divorce_end_year     = double(),
  divorce_end_circa    = string(),
  divorce_city         = string(),
  divorce_state        = string(),
  divorce_county       = string(),
  divorce_country      = string(),
  divorce_place_name   = string()
)

# Open the CSV as an Arrow dataset using the explicit schema `sch`.
# `skip = 1` drops the first line of the file -- presumably the header row,
# since column names and types come from `sch`.
csv_stream <- open_csv_dataset(
  csv_file,
  schema = sch,
  skip = 1
)

# Rewrite the data as Parquet files under `dest`, capped at 5,000,000 rows
# per file.
# NOTE(review): "overwrite" is documented as overwriting existing files that
# new files collide with; confirm that chunks left by an earlier run that
# produced more files cannot linger in `dest`.
write_dataset(
  dataset = csv_stream,
  path = dest,
  format = "parquet",
  existing_data_behavior = "overwrite",
  hive_style = TRUE,
  max_rows_per_file = 5000000L
)